%%% MATLAB function Read_ARB_XML for TRFragCalc %%%
% @ Richard L. Hahnke, 2013

function [ARB_branch, database, export_date] = Read_ARB_XML(filename)
%READ_ARB_XML Read an ARB export in XML format into a reduced MATLAB struct.
%
%   [ARB_branch, database, export_date] = Read_ARB_XML(filename)
%
%   The file is parsed with the MATLAB function xmlread, which returns a
%   Document Object Model (DOM) node, and the DOM tree is converted to a
%   MATLAB struct by parseChildNodes. The entries of that struct are then
%   reduced to the fields listed below.
%
%   Input
%     filename    - name or path of the exported XML file,
%                   e.g. 'CyanobacAllfm.xml'
%
%   Output
%     ARB_branch  - struct returned by parseChildNodes in which the field
%                   'Children' is replaced by the field 'species' and 'Name'
%                   is derived from the file name (directory and '.xml'
%                   extension removed, '_' replaced by ' ').
%                   ARB_branch.species(i).Children is a struct array with the
%                   fields Name and Data and the following fixed slots:
%                     (1) 'acc'         <acc>, Data stored as cellstr
%                     (2) 'sequence'    <ALIGNMENT>/<data>, "U" replaced by "T"
%                     (3) 'full_name'   <full_name>, Data stored as cellstr
%                     (4) 'description' <description>
%                     (5) 'taxonomy'    <tax_slv>
%                   A slot stays empty if the species has no such field.
%     database    - ARB_branch.Attributes(1).Value
%     export_date - ARB_branch.Attributes(2).Value
%
%   Errors
%     'Failed to read XML file ...'  if xmlread fails
%     'Unable to parse XML file ...' if parseChildNodes fails
%
%   Example of the fields of one exported species:
%
%     <acc>AY125384</acc>
%     <ALIGNMENT name="16s">
%         <data>GUG..GG</data>
%     </ALIGNMENT>
%     <aligned_slv>2008-09-30 19:13:41</aligned_slv>
%     <description>Uncultured Synechococcus sp. clone A315024 16S ribosomal RNA gene.</description>
%     <full_name>uncultured Synechococcus sp.</full_name>

%% load xml file
try
    tree = xmlread(filename);
catch
    error('Failed to read XML file %s.',filename);
end

%% convert the DOM tree to a MATLAB struct
% parseChildNodes recurses over the child nodes; this could run into
% problems with very deeply nested trees.
try
    ARB_branch = parseChildNodes(tree);
catch
    error('Unable to parse XML file %s.',filename);
end

%% reduce data of export file
% Every second child (starting with child 1) is without information, so
% only the children 2, 4, 6, ... (the species) are kept.
species = ARB_branch.Children;
species(1:2:end) = [];

% One entry per species, allocated up front so that newSpecies exists even
% for an export without species and always has the same length as species.
emptySlots = struct('Name', {}, 'Data', {});
newSpecies = repmat(struct('Children', emptySlots), 1, numel(species));

% important fields into new struct
% (you can keep further fields of each sequence by copying the template
% between the % *** % markers and filling in the placeholders)
for i = 1:numel(species)

    % The same "every second child is without information" layout applies
    % to the ARB fields of each species.
    arbFields = species(i).Children;
    arbFields(1:2:end) = [];

    for j = 1:numel(arbFields)
        field = arbFields(j);

        % switch compares character vectors exactly (case-sensitive)
        switch field.Name
            case 'acc'
                newSpecies(i).Children(1).Name = field.Name;
                newSpecies(i).Children(1).Data = cellstr(field.Children(1).Data);

            case 'ALIGNMENT'
                % the sequence is one level deeper, in the second child of
                % <ALIGNMENT> (the <data> element in the example above)
                newSpecies(i).Children(2).Name = 'sequence';
                newSpecies(i).Children(2).Data = field.Children(2).Children.Data;

            case 'full_name'
                newSpecies(i).Children(3).Name = field.Name;
                newSpecies(i).Children(3).Data = cellstr(field.Children.Data);

            case 'description'
                newSpecies(i).Children(4).Name = field.Name;
                newSpecies(i).Children(4).Data = field.Children.Data;

            case 'tax_slv'
                newSpecies(i).Children(5).Name = 'taxonomy'; % NEW definition
                newSpecies(i).Children(5).Data = field.Children.Data;

            % *** %
            % Template for keeping a further ARB field:
            % case '?oldName?'
            %     newSpecies(i).Children(??).Name = '?newName?'; % NEW definition
            %     newSpecies(i).Children(??).Data = field.Children.Data; % depends on the nesting of the field
            % *** %
        end
    end
end

%% Replace "U" by "T" in all sequences
for iSpecies = 1:numel(newSpecies)
    slots = newSpecies(iSpecies).Children;
    hasSequence = numel(slots) >= 2 && ...
        (ischar(slots(2).Data) || iscellstr(slots(2).Data));
    if hasSequence
        newSpecies(iSpecies).Children(2).Data = strrep(slots(2).Data, 'U', 'T');
    else
        % A species without an ALIGNMENT entry has no slot 2; report it
        % instead of failing with an index error.
        warning('Read_ARB_XML:noSequence', ...
            'Species %d has no sequence; "U" to "T" replacement skipped.', iSpecies);
    end
end

%% exchange 'Children' with 'species' of reduced data
ARB_branch.species = newSpecies;
ARB_branch = rmfield(ARB_branch, 'Children');

%% finishing

% Name of the branch: file name without directory and without the ".xml"
% extension. fileparts also handles a file name that contains no directory
% part; any other extension is kept as part of the name.
[~, branchName, extension] = fileparts(filename);
if ~strcmpi(extension, '.xml')
    branchName = [branchName extension];
end
ARB_branch.Name = strrep(branchName, '_', ' ');

% NOTE(review): both assignments are left without a terminating semicolon,
% as before, so the values are echoed in the command window - confirm that
% this output is wanted.
database = ARB_branch.Attributes(1).Value
export_date = ARB_branch.Attributes(2).Value